Introduction

To investigate ride behavior differences between casual and member users and uncover temporal and spatial patterns in ride activity, a comprehensive and well-structured database is essential. The analysis focuses on understanding how ride patterns vary across time—daily, weekly, and seasonally—and space—stations and routes—while identifying trends in ride duration, station popularity, and overall demand. These insights are critical for guiding Divvy’s operational decisions and marketing strategies.

The source data for this project consists of 12 monthly Divvy trip datasets for the year 2024, containing ride-level information such as ride identifiers, timestamps, start and end stations, and user type (casual vs. member). To efficiently support analysis, a relational database will be designed to:

  • Consolidate the monthly datasets into a single, queryable structure.
  • Maintain data integrity with primary keys and appropriate data types for timestamps, text fields, and identifiers.
  • Enable temporal analysis by storing ride start and end times in a standardized timestamp format.
  • Support spatial analysis by including station names and IDs, allowing examination of station popularity and route patterns.
  • Facilitate user segmentation by distinguishing between casual and member riders.

By implementing this database, analysts will be able to efficiently query and aggregate data, uncover patterns in ride behavior, and generate actionable insights for Divvy’s operational planning and marketing initiatives.


Database Creation

Database Connection

# Read config
config <- read.ini("db_config.ini")
db <- config$postgresql

# Safe database connection
tryCatch({
  con <- dbConnect(
    Postgres(),
    host = db$host,
    dbname = db$database,
    user = db$user,
    password = db$password,
    port = as.integer(db$port)
  )
}, error = function(e) {
  stop("Database connection failed: ", e$message)
})

# Register connection for SQL chunks
knitr::opts_chunk$set(connection = con)

Prerequisites

Enable the pgcrypto extension for UUID generation

CREATE EXTENSION IF NOT EXISTS pgcrypto;

Enable the btree_gin extension for better composite indexing

CREATE EXTENSION IF NOT EXISTS btree_gin;

Schema & Base Tables

Create a schema to hold all Divvy tables

CREATE SCHEMA IF NOT EXISTS divvy;

Create monthly staging tables. Staging tables mirror the CSV columns exactly. They’re fast to load and easy to QA. We’ll later upsert into a normalized fact table.

months <- c("january","february","march","april","may","june",
             "july","august","september","october","november","december")

for (m in months) {
  sql <- glue("
    CREATE TABLE IF NOT EXISTS divvy.{m} (
      ride_id             TEXT PRIMARY KEY,
      rideable_type       TEXT,
      started_at          TIMESTAMP,
      ended_at            TIMESTAMP,
      start_station_name  TEXT,
      start_station_id    TEXT,
      end_station_name    TEXT,
      end_station_id      TEXT,
      member_casual       TEXT
    );
  ")
  # Drop old table if exists
  dbExecute(con, glue("DROP TABLE IF EXISTS divvy.{m} CASCADE;"))
  
  # Create new table
  dbExecute(con, sql)
}

Load the CSVs

# month name for a given numeric month
month_name <- function(m) tolower(
  format(as.Date(paste0("2024-", sprintf("%02d", m), "-01")), "%B"))

# loop and load
for (m in 1:12) {
  fname <- sprintf("resources/data/2024%02d-divvy-tripdata.csv", m)
  tbl   <- month_name(m)

  message("Loading: ", fname, " -> divvy.", tbl)
  df <- readr::read_csv(fname, show_col_types = FALSE)

  # select/rename only the columns we expect
  expect <- c("ride_id","rideable_type","started_at","ended_at",
              "start_station_name","start_station_id",
              "end_station_name","end_station_id","member_casual")
  df <- df[, intersect(expect, names(df))]

  DBI::dbWriteTable(
    con,
    name = DBI::Id(schema="divvy", table=tbl),
    value = df,
    append = TRUE,     # append to existing monthly table
    row.names = FALSE
  )
}

Normalize & Combine

We’ll build a normalized core: - dim_station: unique stations by station_id with most recent name seen (names can drift). - dim_date: calendar table for temporal joins. - dim_member_type, dim_bike_type: small lookup tables. - fact_trips: the single deduplicated table referencing dimensions.

Dimension tables

Station dimension

CREATE TABLE IF NOT EXISTS divvy.dim_station (
  station_id   TEXT PRIMARY KEY,
  station_name TEXT,
  latitude     DOUBLE PRECISION,
  longitude    DOUBLE PRECISION,
  updated_at   TIMESTAMP DEFAULT now()
);

Member type dimension

CREATE TABLE IF NOT EXISTS divvy.dim_member_type (
  member_type_id SMALLSERIAL PRIMARY KEY,
  member_casual  TEXT UNIQUE  -- 'member'/'casual'
);

Populate member type dimension with known values

INSERT INTO divvy.dim_member_type(member_casual)
VALUES ('member'),('casual')
ON CONFLICT (member_casual) DO NOTHING;

Bike type dimension

CREATE TABLE IF NOT EXISTS divvy.dim_bike_type (
  bike_type_id SMALLSERIAL PRIMARY KEY,
  rideable_type TEXT UNIQUE -- 'classic_bike','electric_bike','docked_bike', etc.
);

Populate dim_bike_type from staging

INSERT INTO divvy.dim_bike_type(rideable_type)
SELECT DISTINCT rideable_type
FROM (
  SELECT rideable_type FROM divvy.january
  UNION ALL SELECT rideable_type FROM divvy.february
  UNION ALL SELECT rideable_type FROM divvy.march
  UNION ALL SELECT rideable_type FROM divvy.april
  UNION ALL SELECT rideable_type FROM divvy.may
  UNION ALL SELECT rideable_type FROM divvy.june
  UNION ALL SELECT rideable_type FROM divvy.july
  UNION ALL SELECT rideable_type FROM divvy.august
  UNION ALL SELECT rideable_type FROM divvy.september
  UNION ALL SELECT rideable_type FROM divvy.october
  UNION ALL SELECT rideable_type FROM divvy.november
  UNION ALL SELECT rideable_type FROM divvy.december
) s
WHERE rideable_type IS NOT NULL
ON CONFLICT (rideable_type) DO NOTHING;

Build/refresh dim_station from both start and end stations

WITH stations AS (
  SELECT start_station_id AS station_id, max(start_station_name) AS station_name
  FROM (
    SELECT * FROM divvy.january
    UNION ALL SELECT * FROM divvy.february
    UNION ALL SELECT * FROM divvy.march
    UNION ALL SELECT * FROM divvy.april
    UNION ALL SELECT * FROM divvy.may
    UNION ALL SELECT * FROM divvy.june
    UNION ALL SELECT * FROM divvy.july
    UNION ALL SELECT * FROM divvy.august
    UNION ALL SELECT * FROM divvy.september
    UNION ALL SELECT * FROM divvy.october
    UNION ALL SELECT * FROM divvy.november
    UNION ALL SELECT * FROM divvy.december
  ) t
  WHERE start_station_id IS NOT NULL
  GROUP BY start_station_id
  UNION
  SELECT end_station_id AS station_id, max(end_station_name) AS station_name
  FROM (
    SELECT * FROM divvy.january
    UNION ALL SELECT * FROM divvy.february
    UNION ALL SELECT * FROM divvy.march
    UNION ALL SELECT * FROM divvy.april
    UNION ALL SELECT * FROM divvy.may
    UNION ALL SELECT * FROM divvy.june
    UNION ALL SELECT * FROM divvy.july
    UNION ALL SELECT * FROM divvy.august
    UNION ALL SELECT * FROM divvy.september
    UNION ALL SELECT * FROM divvy.october
    UNION ALL SELECT * FROM divvy.november
    UNION ALL SELECT * FROM divvy.december
  ) t
  WHERE end_station_id IS NOT NULL
  GROUP BY end_station_id
)
INSERT INTO divvy.dim_station(station_id, station_name)
SELECT DISTINCT ON (station_id) station_id, station_name
FROM stations
ORDER BY station_id, station_name
ON CONFLICT (station_id) DO UPDATE
SET station_name = EXCLUDED.station_name,
    updated_at   = now();

Date dimension

CREATE TABLE IF NOT EXISTS divvy.dim_date (
  date_id        DATE PRIMARY KEY,
  year           INT,
  quarter        INT,
  month          INT,
  week           INT,
  day            INT,
  day_of_week    INT,
  is_weekend     BOOLEAN
);

Populate for 2024

INSERT INTO divvy.dim_date(date_id, year, quarter, month, week, day, day_of_week, is_weekend)
SELECT d::date,
       EXTRACT(YEAR   FROM d)::int,
       EXTRACT(QUARTER FROM d)::int,
       EXTRACT(MONTH  FROM d)::int,
       EXTRACT(WEEK   FROM d)::int,
       EXTRACT(DAY    FROM d)::int,
       EXTRACT(DOW    FROM d)::int,
       (EXTRACT(DOW FROM d) IN (0,6))::boolean
FROM generate_series('2024-01-01'::date, '2024-12-31'::date, '1 day') AS s(d)
ON CONFLICT (date_id) DO NOTHING;

Fact table

A single deduplicated fact table referencing dimensions.

CREATE TABLE IF NOT EXISTS divvy.fact_trips (
  ride_id            TEXT PRIMARY KEY,
  bike_type_id       SMALLINT REFERENCES divvy.dim_bike_type(bike_type_id),
  member_type_id     SMALLINT REFERENCES divvy.dim_member_type(member_type_id),
  started_at         TIMESTAMP NOT NULL,
  ended_at           TIMESTAMP NOT NULL,
  start_station_id   TEXT REFERENCES divvy.dim_station(station_id),
  end_station_id     TEXT REFERENCES divvy.dim_station(station_id),

  -- Generated duration in minutes (kept in the fact for performance)
  ride_length_min    DOUBLE PRECISION GENERATED ALWAYS AS
                      (EXTRACT(EPOCH FROM (ended_at - started_at))/60.0) STORED,

  -- Date keys for fast joins to dim_date
  started_date       DATE GENERATED ALWAYS AS (started_at::date) STORED,
  ended_date         DATE GENERATED ALWAYS AS (ended_at::date) STORED,

  -- Basic data quality checks
  CONSTRAINT chk_positive_duration CHECK (ended_at >= started_at)
);

Helper mappings

Create helper mapping tables for faster upserts. Temporary mapping tables for bike table upserts.

-- 
CREATE TEMP TABLE tmp_bike_map AS
SELECT rideable_type, bike_type_id FROM divvy.dim_bike_type;

Temporary mapping tables for member table upserts.

CREATE TEMP TABLE tmp_member_map AS
SELECT member_casual, member_type_id FROM divvy.dim_member_type;

Upsert into fact table

Upsert from all monthly staging tables with dedup (3NF enforced by FKs & PK). Combine and insert with ON CONFLICT DO NOTHING to enforce uniqueness on ride_id.

INSERT INTO divvy.fact_trips
(ride_id, bike_type_id, member_type_id, started_at, ended_at,
 start_station_id, end_station_id)
SELECT
  t.ride_id,
  b.bike_type_id,
  mt.member_type_id,
  t.started_at,
  t.ended_at,
  t.start_station_id,
  t.end_station_id
FROM (
  SELECT * FROM divvy.january
  UNION ALL SELECT * FROM divvy.february
  UNION ALL SELECT * FROM divvy.march
  UNION ALL SELECT * FROM divvy.april
  UNION ALL SELECT * FROM divvy.may
  UNION ALL SELECT * FROM divvy.june
  UNION ALL SELECT * FROM divvy.july
  UNION ALL SELECT * FROM divvy.august
  UNION ALL SELECT * FROM divvy.september
  UNION ALL SELECT * FROM divvy.october
  UNION ALL SELECT * FROM divvy.november
  UNION ALL SELECT * FROM divvy.december
) t
LEFT JOIN tmp_bike_map b ON t.rideable_type = b.rideable_type
LEFT JOIN tmp_member_map mt ON t.member_casual  = mt.member_casual
-- Basic sanity: require started_at/ended_at and member/bike maps
WHERE t.ride_id IS NOT NULL
  AND t.started_at IS NOT NULL
  AND t.ended_at   IS NOT NULL
  AND t.ended_at > t.started_at
  AND mt.member_type_id IS NOT NULL
  AND b.bike_type_id   IS NOT NULL
ON CONFLICT (ride_id) DO NOTHING;

———————————————————————- The END ———————————————————————–